import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
# Suppress warning output so notebook cells stay readable.
# NOTE(review): this silences ALL warnings, including pandas
# SettingWithCopyWarning — deliberate for a notebook, but hides issues.
import warnings
warnings.filterwarnings("ignore")
# Load the loan-marketing dataset and take a first look.
df=pd.read_csv('Loan_Modelling.csv')
df.head()  # first 5 rows (displayed as notebook cell output)
df.tail()  # last 5 rows
df.info()  # column names, dtypes and non-null counts
Observations: All the columns are numerical.
df.shape[0],df.shape[1]  # (number of rows, number of columns)
Observations: There are 5000 rows and 14 columns in the dataset
df.isnull().sum()  # count of missing values per column
Observations: There are no null values in the dataset
df.duplicated().sum()  # count of fully duplicated rows
Observations: There are no duplicate values in the dataset
# checking the number of unique values in each column
df.nunique()
# ID is a pure row identifier and ZIPCode a high-cardinality location code;
# neither is useful as a modelling feature, so drop both in place.
df.drop(columns=["ID", "ZIPCode"], inplace=True)
df.describe().T  # summary statistics, transposed for readability
Observations:
The following helper functions are defined below to carry out the EDA.
def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=None):
    """
    Draw a boxplot and a histogram of a numeric column on a shared x-axis.

    data: dataframe
    feature: dataframe column name
    figsize: size of figure (default (15, 10))
    kde: whether to overlay the density curve on the histogram (default False)
    bins: number of bins for the histogram (default None -> seaborn's default)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,                                   # boxplot above, histogram below
        sharex=True,                               # shared x-axis so the views line up
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot on top; showmeans adds a triangle marking the mean value.
    sns.boxplot(data=data, x=feature, ax=ax_box2, showmeans=True, color="violet")
    # Only forward `bins` when the caller supplied it, so seaborn's automatic
    # binning applies otherwise.  (Plain if/else replaces the original
    # conditional-expression-as-statement, which is an anti-idiom.)
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    # Mark central tendency on the histogram.
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")   # mean
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")  # median
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot of a categorical column with the count/percentage above each bar.

    data: dataframe
    feature: dataframe column name
    perc: whether to display percentages instead of counts (default False)
    n: display only the top-n category levels (default None, i.e. all levels)
    """
    total = len(data[feature])       # number of rows; denominator for percentages
    count = data[feature].nunique()  # number of distinct levels
    # Widen the figure with the number of bars that will actually be drawn
    # (single expression replaces the duplicated if/else of the original).
    plt.figure(figsize=((count if n is None else n) + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        hue=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # `if perc:` is the idiomatic form of `if perc == True:`
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()  # raw count of each level
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()                 # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate just above the bar
    plt.show()
# Combined boxplot/histogram view for every remaining column.
for column in df.columns:
    histogram_boxplot(df, column)
Observations:
# Columns treated as continuous/numeric in the univariate plots below.
numeric_cols = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage']
# Histogram (with KDE) for each numeric column, one subplot per column.
plt.figure(figsize=(15,12))
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(4, 4, position)  # 4x4 grid leaves spare cells for more columns
    sns.histplot(df[column], kde=True, bins=30, color='skyblue')
    plt.title(column)
plt.tight_layout()
plt.show()
Observations:
# Boxplot for each numeric column, laid out on the same 4x4 grid.
plt.figure(figsize=(15,12))
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(4, 4, position)
    sns.boxplot(x=df[column], color='lightgreen')
    plt.title(column)
plt.tight_layout()
plt.show()
Observations:
# Labeled bar plots (as percentages) for every column except the first.
for column in df.columns.tolist()[1:]:
    labeled_barplot(df, column, perc=True)
Observations:
# How does credit-card spending vary with family size?
ax = sns.boxplot(data=df, x='Family', y='CCAvg')
ax.set_title('Credit Card Spending by Family Size')
plt.show()
Observations:
# Pairwise correlations between the numeric columns.
plt.figure(figsize=(15, 7))
correlations = df[numeric_cols].corr()
sns.heatmap(correlations, annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
Observations:
# Income distribution split by whether the customer took the loan.
ax = sns.boxplot(data=df, x='Personal_Loan', y='Income')
ax.set_title('Income distribution by Personal Loan')
plt.show()
Observations:
# Cross-tabulate loan acceptance against credit-card ownership.
# (Compute the table once instead of twice as in the original.)
ct = pd.crosstab(df['Personal_Loan'], df['CreditCard'])
ct  # bare expression: displayed as notebook cell output
# Stacked barplot of the same table.
ct.plot(kind='bar', stacked=True, figsize=(8,6))
plt.title('Personal Loan vs CreditCard')
plt.show()
Observations:
# outlier detection using boxplot (matplotlib directly, default 1.5*IQR whiskers)
plt.figure(figsize=(15, 12))
for position, column in enumerate(numeric_cols):
    plt.subplot(3, 4, position + 1)
    plt.boxplot(df[column], whis=1.5)
    plt.tight_layout()
    plt.title(column)
plt.show()
Observations:
# Bin Age into decade groups, then one-hot encode the categoricals.
df['Age_Group'] = pd.cut(df['Age'], bins=[20,30,40,50,60,70], labels=['20-30','30-40','40-50','50-60','60-70'])
df = pd.get_dummies(df, columns=['Education','Age_Group'], drop_first=True)

# Continuous columns that will be standardized.
X_numeric = ['Age','Experience','Income','Family','CCAvg','Mortgage']

# X = features, y = target (Personal_Loan)
X = df.drop('Personal_Loan', axis=1)
y = df['Personal_Loan']

# Split BEFORE scaling, and stratify on the (imbalanced) target so both
# sets keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training data only and apply it to both sets.
# Fitting on the full dataframe (as originally written) leaks test-set
# statistics into training.
scaler = StandardScaler()
X_train[X_numeric] = scaler.fit_transform(X_train[X_numeric])
X_test[X_numeric] = scaler.transform(X_test[X_numeric])

print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Observations:
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    predictions = model.predict(predictors)
    metrics = {
        "Accuracy": accuracy_score(target, predictions),
        "Recall": recall_score(target, predictions),
        "Precision": precision_score(target, predictions),
        "F1": f1_score(target, predictions),
    }
    # Single-row dataframe so scores from several models can be concatenated.
    return pd.DataFrame(metrics, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix with the count and percentage in each cell.

    model: fitted classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    total = cm.sum()  # hoisted: original recomputed cm.flatten().sum() per cell
    # One "count\npercent" label per cell; reshape to the matrix's own shape
    # so the function also works for problems with more than two classes
    # (original hard-coded reshape(2, 2)).
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)]
            for item in cm.flatten()
        ]
    ).reshape(cm.shape)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline: a fully-grown decision tree with default hyperparameters.
model1 = DecisionTreeClassifier(random_state=1)
model1.fit(X_train, y_train)
# Class-membership probabilities on the training set.
proba = model1.predict_proba(X_train)
print(proba)
# Training-set confusion matrix and metrics for the baseline tree.
confusion_matrix_sklearn(model1, X_train, y_train)
decision_tree_default_perf_train = model_performance_classification_sklearn(model1, X_train, y_train)
decision_tree_default_perf_train
Observations:
# Evaluate the default tree on the held-out test set.
confusion_matrix_sklearn(model1, X_test, y_test)
decision_tree_default_perf_test = model_performance_classification_sklearn(model1, X_test, y_test)
decision_tree_default_perf_test
Observations:
# Pre-pruning: manual grid search over tree-size hyperparameters.
max_depth_values = np.arange(2, 7, 2)
max_leaf_nodes_values = [50, 75, 150, 250]
min_samples_split_values = [10, 30, 50, 70]

# Track the best model seen so far.
best_estimator = None
best_score_diff = float('inf')  # |train recall - test recall| of current best
best_test_score = 0.0           # test recall of current best

# Iterate over all combinations of the specified parameter values.
for max_depth in max_depth_values:
    for max_leaf_nodes in max_leaf_nodes_values:
        for min_samples_split in min_samples_split_values:
            estimator = DecisionTreeClassifier(
                max_depth=max_depth,
                max_leaf_nodes=max_leaf_nodes,
                min_samples_split=min_samples_split,
                class_weight='balanced',  # compensate for class imbalance
                random_state=42,
            )
            estimator.fit(X_train, y_train)
            # Recall on both sets — the business goal is catching loan takers.
            train_recall_score = recall_score(y_train, estimator.predict(X_train))
            test_recall_score = recall_score(y_test, estimator.predict(X_test))
            # Gap between train and test recall: an overfitting indicator.
            score_diff = abs(train_recall_score - test_recall_score)
            # Keep the model only if it BOTH generalizes better (smaller gap)
            # AND scores higher on the test set than the current best.
            # `and` replaces the original bitwise `&`, the idiomatic operator
            # for combining boolean conditions.
            if score_diff < best_score_diff and test_recall_score > best_test_score:
                best_score_diff = score_diff
                best_test_score = test_recall_score
                best_estimator = estimator

# Print the best parameters
print("Best parameters found:")
print(f"Max depth: {best_estimator.max_depth}")
print(f"Max leaf nodes: {best_estimator.max_leaf_nodes}")
print(f"Min samples split: {best_estimator.min_samples_split}")
print(f"Best test recall score: {best_test_score}")
# The tuned (pre-pruned) model.  best_estimator was already fitted during the
# search, so this refit on the same data is redundant but harmless; it keeps
# the cell self-contained.
model2 = best_estimator
model2.fit(X_train, y_train)
confusion_matrix_sklearn(model2, X_train, y_train)
decision_tree_tune_perf_train = model_performance_classification_sklearn(model2, X_train, y_train)
decision_tree_tune_perf_train
Observations:
# Tuned tree performance on the held-out test set.
confusion_matrix_sklearn(model2, X_test, y_test)
decision_tree_tune_perf_test = model_performance_classification_sklearn(model2, X_test, y_test)
decision_tree_tune_perf_test
Observations:
# Feature importances of the tuned tree, printed and plotted in descending
# order.  (The original computed and printed the importances twice — once
# unsorted, once sorted; the redundant first pass is removed.)
feature_names = list(X_train.columns)
importances = model2.feature_importances_
# sort indices in descending order of importance
indices = np.argsort(importances)[::-1]
for i in indices:
    print(f"{feature_names[i]}: {importances[i]:.3f}")

plt.figure(figsize=(6,4))
plt.barh(range(len(importances)), importances[indices], align="center")
plt.yticks(range(len(importances)), [feature_names[i] for i in indices])
plt.xlabel("Feature Importance")
plt.title("Decision Tree Feature Importances")
plt.show()
# Visualize the pre-pruned tree.
plt.figure(figsize=(20, 10))
out = tree.plot_tree(
    model2,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# Some matplotlib versions omit the arrows on the splits; draw them explicitly.
for annotation in out:
    patch = annotation.arrow_patch
    if patch is not None:
        patch.set_edgecolor("black")
        patch.set_linewidth(1)
plt.show()
# Post-pruning: compute the cost-complexity pruning path on the training set.
clf = DecisionTreeClassifier(random_state=1, class_weight="balanced")
path = clf.cost_complexity_pruning_path(X_train, y_train)
# abs() guards against tiny negative alphas caused by floating-point error.
ccp_alphas, impurities = abs(path.ccp_alphas), path.impurities

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set(
    xlabel="effective alpha",
    ylabel="total impurity of leaves",
    title="Total Impurity vs effective alpha for training set",
)
plt.show()
# Fit one tree per candidate alpha along the pruning path.
clfs = []
for ccp_alpha in ccp_alphas:
    pruned_tree = DecisionTreeClassifier(
        random_state=1, ccp_alpha=ccp_alpha, class_weight="balanced"
    )
    pruned_tree.fit(X_train, y_train)
    clfs.append(pruned_tree)
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)
# The largest alpha prunes the tree down to the root alone, so drop it.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# How tree size and depth shrink as alpha grows.
node_counts = [pruned.tree_.node_count for pruned in clfs]
depth = [pruned.tree_.max_depth for pruned in clfs]
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set(xlabel="alpha", ylabel="number of nodes", title="Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set(xlabel="alpha", ylabel="depth of tree", title="Depth vs alpha")
fig.tight_layout()
# Recall of every pruned tree on the training set...
recall_train = [recall_score(y_train, pruned.predict(X_train)) for pruned in clfs]
# ...and on the test set.
recall_test = [recall_score(y_test, pruned.predict(X_test)) for pruned in clfs]
# Overall accuracy per pruned tree.  NOTE(review): these two lists are not
# referenced again below — kept to preserve the original cell's behavior.
train_scores = [pruned.score(X_train, y_train) for pruned in clfs]
test_scores = [pruned.score(X_test, y_test) for pruned in clfs]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set(xlabel="alpha", ylabel="Recall", title="Recall vs alpha for training and testing sets")
ax.plot(ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
# Select the pruned tree with the highest TEST recall as the final
# post-pruned model.
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
model3 = best_model
confusion_matrix_sklearn(model3, X_train, y_train)
decision_tree_post_perf_train = model_performance_classification_sklearn(model3, X_train, y_train)
decision_tree_post_perf_train
Observations:
# Post-pruned tree performance on the held-out test set.
confusion_matrix_sklearn(model3, X_test, y_test)
decision_tree_post_test = model_performance_classification_sklearn(model3, X_test, y_test)
decision_tree_post_test
Observations:
# Visualize the post-pruned tree.
plt.figure(figsize=(20, 10))
out = tree.plot_tree(
    model3,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# Draw split arrows explicitly in case the matplotlib version omits them.
for annotation in out:
    patch = annotation.arrow_patch
    if patch is not None:
        patch.set_edgecolor("black")
        patch.set_linewidth(1)
plt.show()
# Feature importances of the post-pruned tree.  Ascending sort puts the most
# important feature at the top of the horizontal bar chart.
importances = model3.feature_importances_
order = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(order)), importances[order], color="violet", align="center")
plt.yticks(range(len(order)), [feature_names[i] for i in order])
plt.xlabel("Relative Importance")
plt.show()
# training performance comparison: the three trees side by side.
train_frames = [
    decision_tree_default_perf_train.T,
    decision_tree_tune_perf_train.T,
    decision_tree_post_perf_train.T,
]
models_train_comp_df = pd.concat(train_frames, axis=1)
models_train_comp_df.columns = [
    "Decision Tree (sklearn default)",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
]
print("Training performance comparison:")
models_train_comp_df
# testing performance comparison: the three trees side by side.
test_frames = [
    decision_tree_default_perf_test.T,
    decision_tree_tune_perf_test.T,
    decision_tree_post_test.T,
]
models_test_comp_df = pd.concat(test_frames, axis=1)
models_test_comp_df.columns = [
    "Decision Tree (sklearn default)",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
]
print("Test set performance comparison:")
models_test_comp_df
Observations:
Conclusions:
Recommendations: